pytorch - nn

Lecture 18

Dr. Colin Rundel

Odds & Ends

Torch models

Implementation details:

  • Models are implemented as a class inheriting from torch.nn.Module

  • Must implement constructor and forward() method

    • __init__() should call parent constructor via super()

      • Use torch.nn.Parameter() to indicate model parameters
    • forward() should implement the model - constants + parameters -> return predictions

Fitting procedure:

  • For each iteration of solver:

    • Get current predictions via a call to forward() or equivalent.

    • Calculate a (scalar) loss or equivalent

    • Call backward() method on loss

    • Use built-in optimizer (step() and then zero_grad() if necessary)

From last time

class Model(torch.nn.Module):
    """Linear model ``y_hat = X @ beta`` fit by gradient descent.

    Parameters
    ----------
    X : 2-d tensor of predictors, shape (n, p)
    y : 1-d tensor of responses, shape (n,)
    beta : optional starting coefficients; defaults to ``zeros(p)``
    """
    def __init__(self, X, y, beta=None):
        super().__init__()
        self.X = X
        self.y = y
        if beta is None:
            beta = torch.zeros(X.shape[1])
        # nn.Parameter marks the tensor as trainable (requires_grad=True)
        # itself, so the flag does not need to be set on `beta` first.
        self.beta = torch.nn.Parameter(beta)

    def forward(self, X):
        # Predictions for the supplied design matrix.
        return X @ self.beta

    def fit(self, opt, n=1000, loss_fn=torch.nn.MSELoss()):
        """Run `n` optimizer steps; return the per-iteration loss history."""
        losses = []
        for _ in range(n):
            loss = loss_fn(
                self(self.X).squeeze(),
                self.y.squeeze()
            )
            loss.backward()
            opt.step()
            opt.zero_grad()  # clear grads so they don't accumulate across steps
            losses.append(loss.item())

        return losses

What is self(self.X)?

This is (mostly) just shorthand for calling self.forward(self.X) to generate the output tensors from the current value(s) of the parameters.

This is done via the __call__() method in the torch.nn.Module class. __call__() allows python classes to be invoked like functions.


class greet:
    """Callable object that prefixes a fixed greeting to a given name.

    Demonstrates ``__call__``: instances can be invoked like functions.
    """

    def __init__(self, greeting):
        self.greeting = greeting

    def __call__(self, name):
        # Equivalent to self.greeting + " " + name
        return " ".join((self.greeting, name))
hello = greet("Hello")
hello("Jane")
'Hello Jane'
gm = greet("Good morning")
gm("Bob")
'Good morning Bob'

MNIST & Logistic models

MNIST handwritten digits - simplified

from sklearn.datasets import load_digits
digits = load_digits()
X = digits.data
X.shape
(1797, 64)
X[0:2]
array([[ 0.,  0.,  5., 13.,  9.,  1.,  0.,
         0.,  0.,  0., 13., 15., 10., 15.,
         5.,  0.,  0.,  3., 15.,  2.,  0.,
        11.,  8.,  0.,  0.,  4., 12.,  0.,
         0.,  8.,  8.,  0.,  0.,  5.,  8.,
         0.,  0.,  9.,  8.,  0.,  0.,  4.,
        11.,  0.,  1., 12.,  7.,  0.,  0.,
         2., 14.,  5., 10., 12.,  0.,  0.,
         0.,  0.,  6., 13., 10.,  0.,  0.,
         0.],
       [ 0.,  0.,  0., 12., 13.,  5.,  0.,
         0.,  0.,  0.,  0., 11., 16.,  9.,
         0.,  0.,  0.,  0.,  3., 15., 16.,
         6.,  0.,  0.,  0.,  7., 15., 16.,
        16.,  2.,  0.,  0.,  0.,  0.,  1.,
        16., 16.,  3.,  0.,  0.,  0.,  0.,
         1., 16., 16.,  6.,  0.,  0.,  0.,
         0.,  1., 16., 16.,  6.,  0.,  0.,
         0.,  0.,  0., 11., 16., 10.,  0.,
         0.]])
y = digits.target
y.shape
(1797,)
y[0:10]
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

Example digits

Test train split

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, shuffle=True, random_state=1234
)
X_train.shape
(1437, 64)
y_train.shape
(1437,)
X_test.shape
(360, 64)
y_test.shape
(360,)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
lr = LogisticRegression(
  penalty=None
).fit(
  X_train, y_train
)
accuracy_score(y_train, lr.predict(X_train))
1.0
accuracy_score(y_test, lr.predict(X_test))
0.9583333333333334

As Torch tensors

X_train = torch.from_numpy(X_train).float()
y_train = torch.from_numpy(y_train)
X_test = torch.from_numpy(X_test).float()
y_test = torch.from_numpy(y_test)
X_train.shape
torch.Size([1437, 64])
y_train.shape
torch.Size([1437])
X_test.shape
torch.Size([360, 64])
y_test.shape
torch.Size([360])
X_train.dtype
torch.float32
y_train.dtype
torch.int64
X_test.dtype
torch.float32
y_test.dtype
torch.int64

PyTorch Model

class mnist_model(torch.nn.Module):
    """Multinomial logistic regression: logits = X @ beta + intercept.

    `input_dim` is the number of features (64 pixels), `output_dim`
    the number of classes (10 digits).
    """
    def __init__(self, input_dim, output_dim):
        super().__init__()
        # nn.Parameter sets requires_grad=True itself, so the initial
        # random tensors do not need the flag.
        self.beta = torch.nn.Parameter(
            torch.randn(input_dim, output_dim)
        )
        self.intercept = torch.nn.Parameter(
            torch.randn(output_dim)
        )

    def forward(self, X):
        # Raw class scores (logits); CrossEntropyLoss applies the softmax.
        return (X @ self.beta + self.intercept).squeeze()

    def fit(self, X_train, y_train, X_test, y_test, lr=0.001, n=1000):
        """Fit with SGD + momentum for `n` steps; return the loss history.

        X_test / y_test are accepted for interface consistency with the
        later models but are not used in this version.
        """
        opt = torch.optim.SGD(self.parameters(), lr=lr, momentum=0.9)
        loss_fn = torch.nn.CrossEntropyLoss()  # hoisted out of the loop
        losses = []

        for _ in range(n):
            opt.zero_grad()
            loss = loss_fn(self(X_train), y_train)
            loss.backward()
            opt.step()

            losses.append(loss.item())

        return losses

Cross entropy loss

model = mnist_model(64, 10)
l = model.fit(X_train, y_train, X_test, y_test)

Cross entropy loss

From the pytorch documentation:

\[ \ell(x, y)=L=\left\{l_1, \ldots, l_N\right\}^{\top}, \quad l_n=-w_{y_n} \log \frac{\exp \left(x_{n, y_n}\right)}{\sum_{c=1}^C \exp \left(x_{n, c}\right)} \]

\[ \ell(x, y)= \begin{cases}\sum_{n=1}^N \frac{1}{\sum_{n=1}^N w_{y_n} \cdot 1\left\{y_n \neq \text { ignore_index }\right\}} l_n, & \text { if reduction }=\text { 'mean' } \\ \sum_{n=1}^N l_n, & \text { if reduction }=\text { 'sum' }\end{cases} \]

Out-of-sample accuracy

model(X_test)
tensor([[ 28.3264, -33.8749,  -0.1270,
          32.7894, -10.8112,   8.8204,
          -6.9321,  50.0344,  17.1113,
          14.7006],
        [  4.6819,  83.2354,   0.8541,
          56.6228, -11.0546, -18.0684,
          40.1415,  -1.1687,  30.8871,
          73.6600],
        [ 25.8626,  -8.2121,   2.2649,
          20.2590, -19.4369, -31.6355,
         -24.8725,  83.7362,  16.6910,
           9.7738],
        [ 58.6064,   8.9680,  -2.1831,
         -28.0751,  16.9030,  24.5250,
         120.5182, -34.8430,  57.8929,
         -34.2762],
        [ 89.7358,  28.3815,   2.2295,
          -4.6457,  54.6775,  21.6684,
          43.3220,  13.7041,  55.7687,
          17.3999],
        [-41.4654,  43.9150,  88.4240,
          41.4128,  20.8847, -20.8619,
         -40.2312, -25.3154,  -5.7674,
          33.8998],
        [ 53.3678,  -1.4502,  10.7946,
         -30.4964,  98.3135,  32.0661,
          59.4639,   1.5477,  18.2795,
         -39.7249],
        [  4.7578,  -6.8503,  12.1940,
          75.4213, -54.7117,  18.7842,
           4.4492, -12.1986,  46.7002,
          42.1572],
        [ 11.4094,  32.6730, -19.9298,
          16.3188,   0.1515,  19.5431,
         101.1222,  10.7282,  47.8819,
         -12.6428],
        [-12.2856,  45.6837,  16.8624,
          87.6534, -11.1386, -24.4981,
         -21.7683,  14.3556,  46.9750,
          20.9265],
        [ -3.3379, -24.4710,  15.6235,
          22.2594,   2.2616, -12.2347,
          30.2127,  44.5220,  38.3058,
         -27.2785],
        [-37.7582,  56.0715,  36.6001,
           9.7402,  46.1727,   2.3972,
           8.6747,  -1.4875,  72.3353,
          31.6717],
        [ -4.2163,  -5.0143, -20.8586,
           6.4018,  -9.2674,  19.7805,
          15.1361,  56.8558,   9.5821,
         -19.7090],
        [ 12.4682,  18.1218,  28.2125,
          41.2046, -31.7922,  -0.8101,
          -7.0698, -12.6251,  42.1018,
          98.9157],
        [ -7.7189,  57.6890,   4.6894,
          20.6092, 118.6271,   8.0751,
          48.9944,  33.8145,  50.8630,
          -1.5216],
        [-21.7930,  19.2152,   0.4937,
          69.4961, -57.4631,  19.8917,
         -15.4695,  17.8730,  52.8237,
          46.8817],
        [  9.0583,  22.1341,  24.9855,
          58.2883,  40.7852,  -8.7757,
           8.8961,  12.3769,  64.8430,
          18.0931],
        [ -1.1408, -40.7357,  -5.7575,
          17.0949, -36.8892,   6.2907,
          11.3483,  46.3880,  20.1897,
          -8.9348],
        [-15.5972,  42.9917,  16.2804,
          35.9640,  15.9520,  53.2328,
           7.1783,  -5.2868, 102.9414,
          39.6916],
        [ 43.6112,   2.3475, -19.6114,
          -2.7892, 121.5067,  -9.7285,
         103.3099,  38.0728,  56.0547,
          -6.8426],
        [106.2917, -33.4353,  70.4753,
          15.7874,  20.2558,  18.9655,
         -11.9787, -25.3363,  34.3515,
          24.7798],
        [  1.7341, -33.7498,  -5.4810,
          60.0556, -64.0863,   5.6623,
          11.3748,  -7.7319,  29.6674,
          52.9160],
        [ 18.2223,  -1.2887, -25.0613,
          53.5578, -38.8156,  36.8916,
           8.6201,  16.1522,  35.3990,
          93.1349],
        [-68.0700,  53.8436,  31.8564,
          50.5692,  32.5796,  30.0426,
          20.6423, -38.6212,  33.4638,
          54.4955],
        [ -3.5629,  10.3181,  26.1257,
          51.0479, -36.0236,  11.9486,
         -36.7106,  12.5451,   1.1457,
          12.9942],
        [  4.1050,  14.9849,  -5.3918,
           1.7129,   4.9896,  18.7707,
         115.5077,   5.8887,  54.7871,
          16.3823],
        [ 31.2584,  12.3219,  13.0425,
         -20.8350,   0.2823,  23.7863,
         125.8488, -43.4578,  48.8728,
         -38.6819],
        [ 87.8901, -33.2534,  40.0341,
          19.5798, -25.5586,  15.4357,
          27.1454, -14.8737,  54.4798,
          55.4276],
        [  6.3564,  -1.4427, -20.6713,
          15.0399,  26.4667,  47.3310,
          12.7007, -16.0399, -18.2086,
          27.7155],
        [ 59.4582,   2.4948,  -0.5765,
         -32.4423, 118.2608,  -0.3615,
          69.1639,  21.0575,  23.7075,
         -53.0873],
        ...,
        [-52.6950,  16.5224,   3.3994,
          66.6007, -53.5948,  21.4360,
          10.8065,   5.3819,  51.7721,
          25.8891],
        [ 28.8981,  16.7283,  -6.4411,
         -28.8951, 114.7280, -32.6722,
          92.1893,  52.8203,  51.6249,
         -27.5617],
        [-75.6711, 102.6339,  61.1676,
          42.0912,  46.7820, -12.3524,
         -38.3667,  35.3638,  29.5558,
           1.0318],
        [ 40.1746,  35.0766,  18.4347,
         -11.5654,  58.6465,  32.7927,
          15.0512,  19.9137,  70.9658,
         -12.5549],
        [-65.4245,  76.8906,  36.6000,
          38.9781,  24.5722,  -8.5765,
         -36.8578,  44.8127,  20.0608,
          10.6776],
        [  5.0717,  10.4047,  -1.8503,
          17.3711, -23.2989,  18.8061,
          33.2140,   0.3008,  76.2584,
          50.2001],
        [-28.8018, -21.8400, -31.3073,
          29.4361, -19.1795,  95.8336,
         -32.2080,  22.9108,  25.5249,
          -3.8982],
        [113.4053, -52.5790,  51.4473,
          29.1683, -11.2745,   8.0376,
          17.1120,  -7.4224,  29.4368,
          30.0770],
        [  3.4072,  -9.1898, -28.8616,
           4.4138,   9.6609,  14.1066,
          21.3664,  19.8539,  18.0752,
          37.3372],
        [-32.7928,  15.2876,  92.4384,
          27.4158, -48.9109, -40.8599,
         -17.6459, -27.9494, -15.8311,
         -11.2865],
        [ 18.1725, -18.3929, -13.1424,
          19.0417,  52.2877, -34.1149,
          51.0878,  66.4996,  10.8474,
         -22.6632],
        [-45.5243,  23.5030,  91.7076,
          38.8475, -30.1901, -16.9102,
         -42.4284,  -5.0448,   8.8544,
          47.4572],
        [  0.8648, -10.8119,  36.4522,
          53.0029, -70.4814,   3.8280,
           3.6857, -20.8688,  19.1112,
          33.0244],
        [-27.1591, -17.7183, -19.3732,
           0.1840, -30.6488,  77.9879,
         -60.1883,  35.0241,  33.5656,
         -10.5340],
        [-29.8915,  29.2159,  76.7046,
          18.1899, -12.8748, -14.6175,
         -27.2669, -13.9063,  -7.8565,
          10.7232],
        [-14.2020,  13.3031,  -0.3858,
          26.4320,  15.3615,  19.6714,
         120.5599,  -4.7591,  60.6108,
          32.8546],
        [-41.2355,   1.2386,  13.2733,
          69.2803, -35.0130,  13.0925,
           0.4840,   2.5280,  31.9485,
          28.0139],
        [  0.7518,  20.6091,  10.2568,
           9.6093,  63.5842,  -4.1978,
          -6.0666,  38.7050,  21.1222,
           7.3283],
        [-75.5437,  87.8186,  32.2694,
          51.6267,  25.7083,  -7.6776,
         -43.4048,  59.0556,  24.8164,
           8.2007],
        [-18.9240,  15.0053, -48.5934,
          21.1889,  26.0520,  95.9230,
           5.1014,  -9.3966,   7.5833,
          26.7499],
        [100.2244, -32.2871,  33.4781,
           9.7303,  -5.9504,  14.3187,
          28.8840, -28.5904,  37.8214,
          38.7900],
        [-66.8305,  -6.1832,   1.8715,
           0.7765, -22.0353,  64.8688,
         -69.5829,  27.6665,  28.2962,
          19.5375],
        [ 29.1693,  33.3832,   9.2382,
          23.8065,  87.1511,   2.0488,
          -7.7231,  42.8264,  23.3335,
          21.1437],
        [ 37.1253,  -2.1955,   8.9633,
         -19.0492,   9.7692,  18.2531,
         104.6660, -10.6635,  45.3759,
         -45.7821],
        [-18.4551,  10.1711,  -6.1694,
          74.8079, -33.0756,  19.7801,
          15.1397,  -1.1632,  25.2887,
          43.6638],
        [-20.0854,  25.2533,  55.6692,
          36.0976, -13.0006,  11.1800,
         -10.5839,   2.4158,  25.1964,
          32.5471],
        [ -1.7874, -26.0621, -78.0909,
          42.5109, -48.5792, 109.6042,
          18.2272,  22.2302,  16.5535,
           3.2358],
        [ 66.3632,  49.9809,  15.7131,
           0.2292,  37.8900,  42.7770,
          22.2845, -21.6954,  43.1089,
          52.6583],
        [  8.9369,  27.8989, -17.8641,
          16.4638,  13.1202,  10.0407,
         -21.7885,  54.2365,   7.7730,
          35.3451],
        [-57.5059,  35.7301,  16.7545,
          68.4385, -56.1158,  10.5610,
          15.1933,  -4.3972,  40.8620,
          17.0347]],
       grad_fn=<SqueezeBackward0>)
val, index = torch.max(model(X_test), dim=1)
index
tensor([7, 1, 7, 6, 0, 2, 4, 3, 6, 3, 7, 8, 7,
        9, 4, 3, 8, 7, 8, 4, 0, 3, 9, 9, 3, 6,
        6, 0, 5, 4, 1, 2, 1, 2, 3, 2, 7, 6, 5,
        8, 6, 4, 4, 0, 9, 2, 8, 5, 4, 4, 4, 1,
        7, 6, 8, 2, 9, 9, 9, 0, 1, 3, 1, 8, 8,
        8, 3, 9, 1, 3, 9, 6, 9, 5, 8, 1, 9, 2,
        1, 3, 8, 7, 3, 3, 8, 7, 8, 5, 8, 2, 6,
        1, 9, 1, 6, 4, 5, 2, 2, 4, 5, 4, 3, 6,
        5, 7, 2, 4, 1, 0, 7, 6, 1, 2, 9, 5, 2,
        5, 0, 3, 2, 7, 6, 4, 8, 2, 1, 1, 6, 9,
        6, 8, 2, 4, 7, 5, 0, 9, 1, 0, 5, 6, 7,
        6, 3, 1, 3, 2, 0, 4, 4, 3, 5, 4, 6, 1,
        1, 9, 6, 2, 7, 9, 0, 7, 9, 5, 4, 1, 3,
        8, 6, 4, 7, 1, 5, 7, 4, 7, 4, 3, 2, 2,
        1, 1, 4, 4, 3, 5, 5, 9, 4, 5, 5, 9, 3,
        9, 6, 1, 2, 0, 8, 2, 9, 9, 2, 4, 6, 8,
        3, 8, 1, 0, 8, 1, 8, 5, 6, 8, 7, 1, 8,
        0, 4, 8, 7, 0, 5, 5, 6, 1, 3, 0, 5, 8,
        2, 0, 9, 3, 6, 7, 8, 4, 1, 0, 5, 2, 5,
        1, 6, 4, 7, 1, 2, 6, 4, 4, 6, 3, 2, 3,
        2, 6, 5, 2, 5, 8, 7, 0, 1, 0, 4, 8, 1,
        2, 7, 9, 8, 5, 9, 5, 7, 0, 4, 8, 4, 9,
        4, 0, 7, 0, 7, 5, 3, 5, 3, 5, 7, 9, 8,
        2, 7, 0, 1, 9, 1, 7, 9, 8, 5, 0, 2, 0,
        8, 7, 0, 9, 5, 5, 9, 6, 1, 2, 3, 9, 8,
        3, 2, 9, 3, 4, 3, 4, 1, 8, 1, 8, 5, 0,
        9, 2, 7, 2, 3, 5, 2, 6, 3, 4, 1, 5, 0,
        5, 4, 6, 3, 2, 5, 0, 7, 3])
(index == y_test).sum()
tensor(324)
(index == y_test).sum() / len(y_test)
tensor(0.9000)

Calculating Accuracy

class mnist_model(torch.nn.Module):
    """Multinomial logistic regression that also tracks train/test accuracy."""
    def __init__(self, input_dim, output_dim):
        super().__init__()
        # nn.Parameter sets requires_grad=True itself, so the initial
        # random tensors do not need the flag.
        self.beta = torch.nn.Parameter(
            torch.randn(input_dim, output_dim)
        )
        self.intercept = torch.nn.Parameter(
            torch.randn(output_dim)
        )

    def forward(self, X):
        # Raw class scores (logits); CrossEntropyLoss applies the softmax.
        return (X @ self.beta + self.intercept).squeeze()

    def fit(self, X_train, y_train, X_test, y_test, lr=0.001, n=1000, acc_step=10):
        """Fit with SGD + momentum; return (losses, train_acc, test_acc).

        Accuracies are recorded every `acc_step` iterations (as 0-dim
        tensors).
        """
        opt = torch.optim.SGD(self.parameters(), lr=lr, momentum=0.9)
        loss_fn = torch.nn.CrossEntropyLoss()  # hoisted out of the loop
        losses, train_acc, test_acc = [], [], []

        for i in range(n):
            opt.zero_grad()
            loss = loss_fn(self(X_train), y_train)
            loss.backward()
            opt.step()
            losses.append(loss.item())

            if (i + 1) % acc_step == 0:
                # Predicted class = index of the largest logit per row.
                val, train_pred = torch.max(self(X_train), dim=1)
                val, test_pred = torch.max(self(X_test), dim=1)

                train_acc.append((train_pred == y_train).sum() / len(y_train))
                test_acc.append((test_pred == y_test).sum() / len(y_test))

        return (losses, train_acc, test_acc)

Performance

loss, train_acc, test_acc = mnist_model(
  64, 10
).fit(
  X_train, y_train, X_test, y_test, acc_step=10, n=3000
)

NN Layers

class mnist_nn_model(torch.nn.Module):
    """Logistic regression expressed as a single nn.Linear layer.

    Linear implements y = X A^T + b, so it replaces the hand-rolled
    beta / intercept parameters of `mnist_model`.
    """
    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.linear = torch.nn.Linear(input_dim, output_dim)

    def forward(self, X):
        # Raw class scores (logits); CrossEntropyLoss applies the softmax.
        return self.linear(X)

    def fit(self, X_train, y_train, X_test, y_test, lr=0.001, n=1000, acc_step=10):
        """Fit with SGD + momentum; return (losses, train_acc, test_acc).

        Accuracies are recorded every `acc_step` iterations (as 0-dim
        tensors).
        """
        opt = torch.optim.SGD(self.parameters(), lr=lr, momentum=0.9)
        loss_fn = torch.nn.CrossEntropyLoss()  # hoisted out of the loop
        losses, train_acc, test_acc = [], [], []

        for i in range(n):
            opt.zero_grad()
            loss = loss_fn(self(X_train), y_train)
            loss.backward()
            opt.step()
            losses.append(loss.item())

            if (i + 1) % acc_step == 0:
                # Predicted class = index of the largest logit per row.
                val, train_pred = torch.max(self(X_train), dim=1)
                val, test_pred = torch.max(self(X_test), dim=1)

                train_acc.append((train_pred == y_train).sum() / len(y_train))
                test_acc.append((test_pred == y_test).sum() / len(y_test))

        return (losses, train_acc, test_acc)

NN linear layer

Applies a linear transform to the incoming data (\(X\)): \[y = X A^T+b\]

X.shape
(1797, 64)
model = mnist_nn_model(64, 10)
model.parameters()
<generator object Module.parameters at 0x3281919a0>
list(model.parameters())[0].shape  # A - weights (betas)
torch.Size([10, 64])
list(model.parameters())[1].shape  # b - bias
torch.Size([10])

Performance

loss, train_acc, test_acc = model.fit(X_train, y_train, X_test, y_test, n=1000)
train_acc[-5:]
[tensor(0.9916), tensor(0.9916), tensor(0.9923), tensor(0.9923), tensor(0.9923)]
test_acc[-5:]
[tensor(0.9611), tensor(0.9611), tensor(0.9611), tensor(0.9611), tensor(0.9611)]

Feedforward Neural Network

FNN Model

class mnist_fnn_model(torch.nn.Module):
    """One-hidden-layer feed-forward network: linear -> nl_step -> linear.

    Parameters
    ----------
    input_dim, hidden_dim, output_dim : layer widths
    nl_step : non-linear activation module (e.g. ReLU, Tanh)
    seed : accepted for interface compatibility but currently unused
        # TODO(review): either apply torch.manual_seed(seed) or drop it
    """
    def __init__(self, input_dim, hidden_dim, output_dim, nl_step=torch.nn.ReLU(), seed=1234):
        super().__init__()
        self.l1 = torch.nn.Linear(input_dim, hidden_dim)
        self.nl = nl_step
        self.l2 = torch.nn.Linear(hidden_dim, output_dim)

    def forward(self, X):
        out = self.l1(X)
        out = self.nl(out)
        out = self.l2(out)
        return out

    def fit(self, X_train, y_train, X_test, y_test, lr=0.001, n=1000, acc_step=10):
        """Fit with SGD + momentum; return (losses, train_acc, test_acc).

        Accuracies are recorded every `acc_step` iterations as floats.
        """
        opt = torch.optim.SGD(self.parameters(), lr=lr, momentum=0.9)
        loss_fn = torch.nn.CrossEntropyLoss()  # hoisted out of the loop
        losses, train_acc, test_acc = [], [], []

        for i in range(n):
            opt.zero_grad()
            loss = loss_fn(self(X_train), y_train)
            loss.backward()
            opt.step()

            losses.append(loss.item())

            if (i + 1) % acc_step == 0:
                # Predicted class = index of the largest logit per row.
                val, train_pred = torch.max(self(X_train), dim=1)
                val, test_pred = torch.max(self(X_test), dim=1)

                train_acc.append((train_pred == y_train).sum().item() / len(y_train))
                test_acc.append((test_pred == y_test).sum().item() / len(y_test))

        return (losses, train_acc, test_acc)

Non-linear activation functions

\[\text{Tanh}(x) = \frac{\exp(x)-\exp(-x)}{\exp(x) + \exp(-x)}\]

\[\text{ReLU}(x) = \max(0,x)\]

Model parameters

model = mnist_fnn_model(64,64,10)
len(list(model.parameters()))
4
for i, p in enumerate(model.parameters()):
  print("Param", i, p.shape)
Param 0 torch.Size([64, 64])
Param 1 torch.Size([64])
Param 2 torch.Size([10, 64])
Param 3 torch.Size([10])

Performance - ReLU

loss, train_acc, test_acc = mnist_fnn_model(64,64,10).fit(
  X_train, y_train, X_test, y_test, n=2000
)
train_acc[-5:]
[0.9986082115518441, 0.9986082115518441, 0.9986082115518441, 0.9986082115518441, 0.9986082115518441]
test_acc[-5:]
[0.9694444444444444, 0.9694444444444444, 0.9694444444444444, 0.9694444444444444, 0.9694444444444444]

Performance - tanh

loss, train_acc, test_acc = mnist_fnn_model(64,64,10, nl_step=torch.nn.Tanh()).fit(
  X_train, y_train, X_test, y_test, n=2000
)
train_acc[-5:]
[0.9951287404314544, 0.9951287404314544, 0.9951287404314544, 0.9951287404314544, 0.9951287404314544]
test_acc[-5:]
[0.9694444444444444, 0.9694444444444444, 0.9694444444444444, 0.9694444444444444, 0.9694444444444444]

Adding another layer

class mnist_fnn2_model(torch.nn.Module):
    """Two-hidden-layer feed-forward network sharing one activation module.

    Parameters
    ----------
    input_dim, hidden_dim, output_dim : layer widths (both hidden layers
        use `hidden_dim`)
    nl_step : non-linear activation module used after each hidden layer
    seed : accepted for interface compatibility but currently unused
        # TODO(review): either apply torch.manual_seed(seed) or drop it
    """
    def __init__(self, input_dim, hidden_dim, output_dim, nl_step=torch.nn.ReLU(), seed=1234):
        super().__init__()
        self.l1 = torch.nn.Linear(input_dim, hidden_dim)
        # The original assigned self.nl twice; one assignment suffices —
        # the same (stateless) activation module is reused in forward().
        self.nl = nl_step
        self.l2 = torch.nn.Linear(hidden_dim, hidden_dim)
        self.l3 = torch.nn.Linear(hidden_dim, output_dim)

    def forward(self, X):
        out = self.l1(X)
        out = self.nl(out)
        out = self.l2(out)
        out = self.nl(out)
        out = self.l3(out)
        return out

    def fit(self, X_train, y_train, X_test, y_test, lr=0.001, n=1000, acc_step=10):
        """Fit with SGD + momentum; return (losses, train_acc, test_acc).

        Accuracies are recorded every `acc_step` iterations as floats.
        """
        loss_fn = torch.nn.CrossEntropyLoss()
        opt = torch.optim.SGD(self.parameters(), lr=lr, momentum=0.9)
        losses, train_acc, test_acc = [], [], []

        for i in range(n):
            opt.zero_grad()
            loss = loss_fn(self(X_train), y_train)
            loss.backward()
            opt.step()

            losses.append(loss.item())

            if (i + 1) % acc_step == 0:
                # Predicted class = index of the largest logit per row.
                val, train_pred = torch.max(self(X_train), dim=1)
                val, test_pred = torch.max(self(X_test), dim=1)

                train_acc.append((train_pred == y_train).sum().item() / len(y_train))
                test_acc.append((test_pred == y_test).sum().item() / len(y_test))

        return (losses, train_acc, test_acc)

Performance - relu

loss, train_acc, test_acc = mnist_fnn2_model(
  64,64,10, nl_step=torch.nn.ReLU()
).fit(
  X_train, y_train, X_test, y_test, n=1000
)
train_acc[-5:]
[0.9923451635351427, 0.9930410577592206, 0.9930410577592206, 0.9930410577592206, 0.9930410577592206]
test_acc[-5:]
[0.9694444444444444, 0.9694444444444444, 0.9694444444444444, 0.9694444444444444, 0.9694444444444444]

Performance - tanh

loss, train_acc, test_acc = mnist_fnn2_model(
  64,64,10, nl_step=torch.nn.Tanh()
).fit(
  X_train, y_train, X_test, y_test, n=1000
)
train_acc[-5:]
[0.9846903270702854, 0.9846903270702854, 0.9846903270702854, 0.9853862212943633, 0.9853862212943633]
test_acc[-5:]
[0.9722222222222222, 0.9722222222222222, 0.9722222222222222, 0.9722222222222222, 0.9722222222222222]

Convolutional NN

2d convolutions

nn.Conv2d()

cv = torch.nn.Conv2d(
  in_channels=1, out_channels=4, 
  kernel_size=3, 
  stride=1, padding=1
)
list(cv.parameters())[0] # kernel weights
Parameter containing:
tensor([[[[-0.1167, -0.1832,  0.0059],
          [ 0.2188, -0.0119, -0.0130],
          [ 0.2176,  0.2637, -0.1447]]],

        [[[-0.0134,  0.1201,  0.0502],
          [-0.1689,  0.0637,  0.0928],
          [-0.0445, -0.1124,  0.1476]]],

        [[[-0.0168, -0.2369,  0.1777],
          [ 0.1322, -0.1596,  0.0169],
          [-0.1928,  0.3038, -0.1910]]],

        [[[ 0.2350, -0.1067,  0.0957],
          [-0.2044,  0.0171,  0.3302],
          [ 0.0489,  0.2266, -0.2085]]]],
       requires_grad=True)
list(cv.parameters())[1] # biases
Parameter containing:
tensor([ 0.1893,  0.2009, -0.2069, -0.2023],
       requires_grad=True)

Applying Conv2d()

X_train[[0]]
tensor([[ 0.,  0.,  0., 10., 11.,  0.,  0.,
          0.,  0.,  0.,  9., 16.,  6.,  0.,
          0.,  0.,  0.,  0., 15., 13.,  0.,
          0.,  0.,  0.,  0.,  0., 14., 10.,
          0.,  0.,  0.,  0.,  0.,  1., 15.,
         12.,  8.,  2.,  0.,  0.,  0.,  0.,
         12., 16., 16., 16., 10.,  1.,  0.,
          0.,  7., 16., 12., 12., 16.,  4.,
          0.,  0.,  0.,  9., 15., 12.,  5.,
          0.]])
X_train[[0]].shape
torch.Size([1, 64])
cv(X_train[[0]])
RuntimeError: Expected 3D (unbatched) or 4D (batched) input to conv2d, but got input of size: [1, 64]
X_train[[0]].view(1,8,8)
tensor([[[ 0.,  0.,  0., 10., 11.,  0.,  0.,
           0.],
         [ 0.,  0.,  9., 16.,  6.,  0.,  0.,
           0.],
         [ 0.,  0., 15., 13.,  0.,  0.,  0.,
           0.],
         [ 0.,  0., 14., 10.,  0.,  0.,  0.,
           0.],
         [ 0.,  1., 15., 12.,  8.,  2.,  0.,
           0.],
         [ 0.,  0., 12., 16., 16., 16., 10.,
           1.],
         [ 0.,  0.,  7., 16., 12., 12., 16.,
           4.],
         [ 0.,  0.,  0.,  9., 15., 12.,  5.,
           0.]]])
cv(X_train[[0]].view(1,8,8))
tensor([[[ 0.1893, -1.1129,  0.1171,  5.2364,
           7.3098,  3.9011,  0.1893,  0.1893],
         [ 0.1893, -2.0983,  2.0070,  6.8142,
           3.2647,  0.2183,  0.1893,  0.1893],
         [ 0.1893, -1.9791,  0.5317,  5.0533,
           2.2427, -0.5109,  0.1893,  0.1893],
         [ 0.0446, -1.8120, -0.3422,  4.2718,
           5.2909,  2.4572,  0.6244,  0.1893],
         [ 0.1762, -1.6723, -1.5831,  4.2735,
           6.9118,  8.1689,  6.6000,  2.6286],
         [ 0.1951, -1.0753, -3.4250,  2.5202,
           5.3464,  5.5298,  9.5757,  6.9009],
         [ 0.1893,  0.1683, -3.5091, -2.6602,
           2.8636,  3.4284,  2.8088,  3.3799],
         [ 0.1893,  0.2303, -1.1167, -3.7906,
          -2.1711, -0.2415, -1.5528, -1.3167]],

        [[ 0.2009,  1.5291,  2.4786,  0.5452,
          -2.1745, -1.9245,  0.2009,  0.2009],
         [ 0.2009,  3.2494,  2.9933, -0.1189,
          -1.5108, -0.9599,  0.2009,  0.2009],
         [ 0.2009,  4.1102,  4.1491, -1.1490,
          -1.9335,  0.1206,  0.2009,  0.2009],
         [ 0.3484,  4.3539,  4.5158, -1.0015,
          -2.8001, -0.3799,  0.1118,  0.2009],
         [ 0.2936,  4.1297,  5.2972,  0.2167,
          -1.4137, -2.0572, -1.8250, -0.3566],
         [ 0.2511,  3.2202,  6.4151,  1.9810,
           0.6124,  0.0564, -3.5409, -2.5864],
         [ 0.2009,  1.4526,  5.7039,  4.9177,
           1.5717,  1.3543, -0.4949, -2.4836],
         [ 0.2009,  0.5522,  2.6800,  4.5965,
           2.5787,  0.9790,  0.4546, -0.3772]],

        [[-0.2069, -1.9260, -0.3596,  0.3636,
          -1.9020,  0.0902, -0.2069, -0.2069],
         [-0.2069, -2.9197,  2.4785, -0.8254,
          -4.3294,  0.4014, -0.2069, -0.2069],
         [-0.2069, -1.0281,  0.6737, -2.8348,
          -2.1065, -0.3076, -0.2069, -0.2069],
         [-0.3979,  0.1337, -1.4428, -4.0576,
           0.6316, -1.1414, -0.5924, -0.2069],
         [-0.1900,  0.0827, -3.2152, -3.1163,
          -1.3112,  0.3976, -0.1799, -1.8308],
         [-0.0292,  1.0873, -4.2183, -1.3568,
          -3.8465, -2.8079,  2.0789, -0.9140],
         [-0.2069,  2.0436, -2.7719, -2.9114,
          -0.4900, -2.7479, -4.3602, -0.0993],
         [-0.2069,  1.0368,  1.1300, -3.1646,
          -2.1869, -0.2560, -2.6997, -0.7623]],

        [[-0.2023, -2.0791,  1.8030,  6.4152,
           0.0830, -2.1573, -0.2023, -0.2023],
         [-0.2023, -0.3583,  6.8802,  3.8777,
          -1.5585,  1.1559, -0.2023, -0.2023],
         [-0.2023,  2.6928,  6.0051,  0.8864,
           0.7487,  1.2074, -0.2023, -0.2023],
         [-0.4109,  2.9549,  3.9288,  1.0290,
           2.7906,  0.6418, -0.1046, -0.2023],
         [ 0.1279,  3.6054,  2.6587,  2.6771,
           1.5629,  0.5192,  2.2284,  0.5129],
         [-0.1066,  3.6294,  3.3193,  7.3773,
           5.2406,  1.7384,  0.8766, -0.5408],
         [-0.2023,  3.2576,  3.5751,  4.1585,
           5.6158,  8.2529,  3.4468, -0.9169],
         [-0.2023,  0.4676,  3.5542,  5.9911,
           5.8046,  1.6586, -1.0737,  2.1085]]],
       grad_fn=<SqueezeBackward1>)

Pooling

x = torch.tensor(
  [[[0,0,0,0],
    [0,1,2,0],
    [0,3,4,0],
    [0,0,0,0]]],
  dtype=torch.float
)
x.shape
torch.Size([1, 4, 4])
torch.nn.MaxPool2d(
  kernel_size=2, stride=1
)(x)
tensor([[[1., 2., 2.],
         [3., 4., 4.],
         [3., 4., 4.]]])
torch.nn.MaxPool2d(
  kernel_size=3, stride=1, padding=1
)(x)
tensor([[[1., 2., 2., 2.],
         [3., 4., 4., 4.],
         [3., 4., 4., 4.],
         [3., 4., 4., 4.]]])
torch.nn.AvgPool2d(
  kernel_size=2
)(x)
tensor([[[0.2500, 0.5000],
         [0.7500, 1.0000]]])
torch.nn.AvgPool2d(
  kernel_size=2, padding=1
)(x)
tensor([[[0.0000, 0.0000, 0.0000],
         [0.0000, 2.5000, 0.0000],
         [0.0000, 0.0000, 0.0000]]])

Convolutional model

class mnist_conv_model(torch.nn.Module):
    """Small CNN for the 8x8 digit images: conv -> ReLU -> max-pool -> linear."""

    def __init__(self):
        super().__init__()
        self.cnn  = torch.nn.Conv2d(
          in_channels=1, out_channels=8,
          kernel_size=3, stride=1, padding=1
        )
        self.relu = torch.nn.ReLU()
        self.pool = torch.nn.MaxPool2d(kernel_size=2)
        self.lin  = torch.nn.Linear(8 * 4 * 4, 10)

    def forward(self, X):
        # Reshape each flat 64-feature row into a 1-channel 8x8 image,
        # run the conv stack, then flatten for the linear read-out.
        feats = self.pool(self.relu(self.cnn(X.view(-1, 1, 8, 8))))
        return self.lin(feats.view(-1, 8 * 4 * 4))

    def fit(self, X_train, y_train, X_test, y_test, lr=0.001, n=1000, acc_step=10):
        """SGD training loop; returns (losses, train_acc, test_acc)."""
        loss_fn = torch.nn.CrossEntropyLoss()
        opt = torch.optim.SGD(self.parameters(), lr=lr, momentum=0.9)
        losses, train_acc, test_acc = [], [], []

        for step in range(1, n + 1):
            opt.zero_grad()
            loss = loss_fn(self(X_train), y_train)
            loss.backward()
            opt.step()
            losses.append(loss.item())

            if step % acc_step == 0:
                # Predicted class = argmax over the logits of each row.
                train_pred = torch.max(self(X_train), dim=1)[1]
                test_pred = torch.max(self(X_test), dim=1)[1]

                train_acc.append((train_pred == y_train).sum().item() / len(y_train))
                test_acc.append((test_pred == y_test).sum().item() / len(y_test))

        return (losses, train_acc, test_acc)

Performance

loss, train_acc, test_acc = mnist_conv_model().fit(
  X_train, y_train, X_test, y_test, n=1000
)
train_acc[-5:]
[0.9930410577592206, 0.9930410577592206, 0.9937369519832986, 0.9937369519832986, 0.9937369519832986]
test_acc[-5:]
[0.9722222222222222, 0.9722222222222222, 0.9722222222222222, 0.9722222222222222, 0.9722222222222222]

Organizing models

class mnist_conv_model2(torch.nn.Module):
    """Same CNN as `mnist_conv_model`, organized as one nn.Sequential.

    Unflatten/Flatten replace the manual view() reshapes of the
    layer-by-layer version.
    """
    def __init__(self):
        super().__init__()
        self.model = torch.nn.Sequential(
            torch.nn.Unflatten(1, (1, 8, 8)),   # flat 64 features -> 1x8x8 image
            torch.nn.Conv2d(
                in_channels=1, out_channels=8,
                kernel_size=3, stride=1, padding=1
            ),
            torch.nn.ReLU(),
            torch.nn.MaxPool2d(kernel_size=2),
            torch.nn.Flatten(),                 # 8x4x4 feature maps -> 128 features
            torch.nn.Linear(8 * 4 * 4, 10)
        )

    def forward(self, X):
        return self.model(X)

    def fit(self, X_train, y_train, X_test, y_test, lr=0.001, n=1000, acc_step=10):
        """Fit with SGD + momentum; return (losses, train_acc, test_acc).

        Accuracies are recorded every `acc_step` iterations (as 0-dim
        tensors).
        """
        opt = torch.optim.SGD(self.parameters(), lr=lr, momentum=0.9)
        loss_fn = torch.nn.CrossEntropyLoss()  # hoisted out of the loop
        losses, train_acc, test_acc = [], [], []

        for i in range(n):
            opt.zero_grad()
            loss = loss_fn(self(X_train), y_train)
            loss.backward()
            opt.step()

            losses.append(loss.item())

            if (i + 1) % acc_step == 0:
                # Predicted class = index of the largest logit per row.
                val, train_pred = torch.max(self(X_train), dim=1)
                val, test_pred = torch.max(self(X_test), dim=1)

                train_acc.append((train_pred == y_train).sum() / len(y_train))
                test_acc.append((test_pred == y_test).sum() / len(y_test))

        return (losses, train_acc, test_acc)

A bit more on non-linear
activation layers

Non-linear functions

df = pd.read_csv("data/gp.csv")
X = torch.tensor(df["x"], dtype=torch.float32).reshape(-1,1)
y = torch.tensor(df["y"], dtype=torch.float32)

Linear regression

class lin_reg(torch.nn.Module):
    """Plain linear regression (a single nn.Linear layer) fit by SGD on MSE.

    ``X`` is only used to record the data dimensions; the response is a
    single scalar per observation.
    """
    def __init__(self, X):
        super().__init__()
        self.n = X.shape[0]  # number of observations
        self.p = X.shape[1]  # number of features
        # Fix: the output dimension of a (single-response) regression is 1,
        # matching the other models below. The original Linear(p, p) only
        # behaved correctly because the example data has p == 1.
        self.model = torch.nn.Sequential(
          torch.nn.Linear(self.p, 1)
        )
    
    def forward(self, X):
        return self.model(X)
    
    def fit(self, X, y, n=1000):
      """Run ``n`` SGD steps (lr=0.001, momentum=0.9); return per-step losses."""
      losses = []
      opt = torch.optim.SGD(self.parameters(), lr=0.001, momentum=0.9)
      loss_fn = torch.nn.MSELoss()  # construct once, not per iteration
      for i in range(n):
          loss = loss_fn(self(X).squeeze(), y)
          loss.backward()
          opt.step()
          opt.zero_grad()
          losses.append(loss.item())
      
      return losses

Model results

# Construct the linear regression model and train it for 2000 iterations.
m1 = lin_reg(X)
loss = m1.fit(X,y, n=2000)

Training loss:

Predictions

Double linear regression

class dbl_lin_reg(torch.nn.Module):
    """Two stacked linear layers with no activation in between.

    Note the composition of two linear maps is still linear, so the
    hidden layer adds parameters but no extra expressiveness.
    """

    def __init__(self, X, hidden_dim=10):
        super().__init__()
        self.n, self.p = X.shape[0], X.shape[1]
        stack = [
            torch.nn.Linear(self.p, hidden_dim),
            torch.nn.Linear(hidden_dim, 1),
        ]
        self.model = torch.nn.Sequential(*stack)

    def forward(self, X):
        return self.model(X)

    def fit(self, X, y, n=1000):
        """Minimize MSE with SGD (lr=0.001, momentum=0.9); return the loss history."""
        opt = torch.optim.SGD(self.parameters(), lr=0.001, momentum=0.9)
        history = []
        for _ in range(n):
            step_loss = torch.nn.MSELoss()(self(X).squeeze(), y)
            step_loss.backward()
            opt.step()
            opt.zero_grad()
            history.append(step_loss.item())
        return history

Model results

# Fit the two-layer (purely linear) model for 2000 iterations; since the
# layers compose into a single linear map it cannot outperform lin_reg.
m2 = dbl_lin_reg(X, hidden_dim=10)
loss = m2.fit(X,y, n=2000)

Training loss:

Predictions

Non-linear regression w/ ReLU

class lin_reg_relu(torch.nn.Module):
    """One hidden layer with a ReLU activation — a basic non-linear regressor."""

    def __init__(self, X, hidden_dim=100):
        super().__init__()
        self.n, self.p = X.shape[0], X.shape[1]
        self.model = torch.nn.Sequential(
            torch.nn.Linear(self.p, hidden_dim),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_dim, 1),
        )

    def forward(self, X):
        return self.model(X)

    def fit(self, X, y, n=1000):
        """SGD training on MSE (lr=0.001, momentum=0.9); returns per-step losses."""
        opt = torch.optim.SGD(self.parameters(), lr=0.001, momentum=0.9)
        history = []
        for _ in range(n):
            step_loss = torch.nn.MSELoss()(self(X).squeeze(), y)
            step_loss.backward()
            opt.step()
            opt.zero_grad()
            history.append(step_loss.item())
        return history

Model results

Hidden dimensions

Non-linear regression w/ Tanh

class lin_reg_tanh(torch.nn.Module):
    """One hidden layer with a Tanh activation instead of ReLU."""

    def __init__(self, X, hidden_dim=10):
        super().__init__()
        self.n, self.p = X.shape[0], X.shape[1]
        self.model = torch.nn.Sequential(
            torch.nn.Linear(self.p, hidden_dim),
            torch.nn.Tanh(),
            torch.nn.Linear(hidden_dim, 1),
        )

    def forward(self, X):
        return self.model(X)

    def fit(self, X, y, n=1000):
        """SGD training on MSE (lr=0.001, momentum=0.9); returns per-step losses."""
        opt = torch.optim.SGD(self.parameters(), lr=0.001, momentum=0.9)
        history = []
        for _ in range(n):
            step_loss = torch.nn.MSELoss()(self(X).squeeze(), y)
            step_loss.backward()
            opt.step()
            opt.zero_grad()
            history.append(step_loss.item())
        return history

Tanh & hidden dimension

Three layers

class three_layers(torch.nn.Module):
    """Three linear layers (two ReLU-activated hidden layers + linear output)."""

    def __init__(self, X, hidden_dim=100):
        super().__init__()
        self.n, self.p = X.shape[0], X.shape[1]
        stack = [
            torch.nn.Linear(self.p, hidden_dim),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_dim, hidden_dim),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_dim, 1),
        ]
        self.model = torch.nn.Sequential(*stack)

    def forward(self, X):
        return self.model(X)

    def fit(self, X, y, n=1000):
        """SGD training on MSE (lr=0.001, momentum=0.9); returns per-step losses."""
        opt = torch.optim.SGD(self.parameters(), lr=0.001, momentum=0.9)
        history = []
        for _ in range(n):
            step_loss = torch.nn.MSELoss()(self(X).squeeze(), y)
            step_loss.backward()
            opt.step()
            opt.zero_grad()
            history.append(step_loss.item())
        return history

Model results

Five layers

class five_layers(torch.nn.Module):
    """Five linear layers with ReLU activations between each pair."""

    def __init__(self, X, hidden_dim=100):
        super().__init__()
        self.n, self.p = X.shape[0], X.shape[1]
        # input layer, three hidden-to-hidden blocks, then the output layer
        stack = [torch.nn.Linear(self.p, hidden_dim), torch.nn.ReLU()]
        for _ in range(3):
            stack += [torch.nn.Linear(hidden_dim, hidden_dim), torch.nn.ReLU()]
        stack.append(torch.nn.Linear(hidden_dim, 1))
        self.model = torch.nn.Sequential(*stack)

    def forward(self, X):
        return self.model(X)

    def fit(self, X, y, n=1000):
        """SGD training on MSE (lr=0.001, momentum=0.9); returns per-step losses."""
        opt = torch.optim.SGD(self.parameters(), lr=0.001, momentum=0.9)
        history = []
        for _ in range(n):
            step_loss = torch.nn.MSELoss()(self(X).squeeze(), y)
            step_loss.backward()
            opt.step()
            opt.zero_grad()
            history.append(step_loss.item())
        return history

Model results